{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "db8736a7-ed94-441c-9556-831fa57b5a10",
   "metadata": {},
   "source": [
    "# \"THE PRICE IS RIGHT\" Capstone Project\n",
    "\n",
    "This week - build a model that predicts how much something costs from a description, based on a scrape of Amazon data\n",
    "\n",
    "# Order of play\n",
    "\n",
    "DAY 1: Data Curation  \n",
    "DAY 2: Data Pre-processing  \n",
    "DAY 3: Evaluation, Baselines, Traditional ML  \n",
    "DAY 4: Deep Learning and LLMs  \n",
    "DAY 5: Fine-tuning a Frontier Model  \n",
    "\n",
    "## DAY 5: Fine-tuning a Frontier Model\n",
    "\n",
    "Now we will use OpenAI's API to fine-tune our own private variant of GPT-4.1-nano"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "681c717b-4c24-4ac3-a5f3-3c5881d6e70a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import re\n",
    "import json\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "from openai import OpenAI\n",
    "from pricer.items  import Item\n",
    "from pricer.evaluator import evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36d05bdc-0155-4c72-a7ee-aa4e614ffd3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# environment\n",
    "\n",
    "LITE_MODE = False\n",
    "\n",
    "load_dotenv(override=True)\n",
    "hf_token = os.environ['HF_TOKEN']\n",
    "login(hf_token, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f5fb819",
   "metadata": {},
   "outputs": [],
   "source": [
    "username = \"ed-donner\"\n",
    "dataset = f\"{username}/items_lite\" if LITE_MODE else f\"{username}/items_full\"\n",
    "\n",
    "train, val, test = Item.from_hub(dataset)\n",
    "\n",
    "print(f\"Loaded {len(train):,} training items, {len(val):,} validation items, {len(test):,} test items\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0a6fb86-74a4-403c-ab25-6db2d74e9d2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "openai = OpenAI()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a9493fd",
   "metadata": {},
   "source": [
    "# Data size\n",
    "\n",
    "OpenAI recommends fine-tuning with a small population of 50-100 examples\n",
    "\n",
    "I'm going to go with 20,000 points.\n",
    "\n",
    "This cost me $3.42 - you should stick with 100 examples and the cost will be minimal!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8367135-f40e-43e1-8f3c-09e990ab1194",
   "metadata": {},
   "outputs": [],
   "source": [
    "# OpenAI recommends fine-tuning with populations of 50-100 examples\n",
    "# But as our examples are very small, I'm suggesting we go with 100 examples (and 1 epoch)\n",
    "\n",
    "\n",
    "fine_tune_train = train[:100]\n",
    "fine_tune_validation = val[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b614e8c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(fine_tune_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8be4a889-81c3-42b1-a2fc-034cdc7321a6",
   "metadata": {},
   "source": [
    "# Step 1\n",
    "\n",
    "Prepare our data for fine-tuning in JSONL (JSON Lines) format and upload to OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ae2fb3c-1cff-4ce3-911e-627c970edd7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def messages_for(item):\n",
    "    message = f\"Estimate the price of this product. Respond with the price, no explanation\\n\\n{item.summary}\"\n",
    "    return [\n",
    "        {\"role\": \"user\", \"content\": message},\n",
    "        {\"role\": \"assistant\", \"content\": f\"${item.price:.2f}\"}\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1aa280f6-1227-426a-a2e2-1ce985feba1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages_for(fine_tune_train[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0e5b56c-8a0b-4d8e-a112-ce87efb4e152",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the items into a list of json objects - a \"jsonl\" string\n",
    "# Each row represents a message in the form:\n",
    "# {\"messages\" : [{\"role\": \"system\", \"content\": \"You estimate prices...\n",
    "\n",
    "\n",
    "def make_jsonl(items):\n",
    "    result = \"\"\n",
    "    for item in items:\n",
    "        messages = messages_for(item)\n",
    "        messages_str = json.dumps(messages)\n",
    "        result += '{\"messages\": ' + messages_str +'}\\n'\n",
    "    return result.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e72de93-a6a6-4b35-855e-15786b97bf5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(make_jsonl(train[:3]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7734bff0-95c4-4e67-a87e-7e2254e2c67d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the items into jsonl and write them to a file\n",
    "\n",
    "def write_jsonl(items, filename):\n",
    "    with open(filename, \"w\") as f:\n",
    "        jsonl = make_jsonl(items)\n",
    "        f.write(jsonl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "393d3ad8-999a-4f99-8c04-339d9166d604",
   "metadata": {},
   "outputs": [],
   "source": [
    "write_jsonl(fine_tune_train, \"jsonl/fine_tune_train.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e23927f-d73e-4668-ac20-abe6f14a56cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "write_jsonl(fine_tune_validation, \"jsonl/fine_tune_validation.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d59ad8d2-c61a-448e-b7ed-232f1606970f",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"jsonl/fine_tune_train.jsonl\", \"rb\") as f:\n",
    "    train_file = openai.files.create(file=f, purpose=\"fine-tune\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "083fefba-fd54-47ce-9ff3-aabbc200846f",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97df3360-0760-4422-a556-5f26d23de6dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"jsonl/fine_tune_validation.jsonl\", \"rb\") as f:\n",
    "    validation_file = openai.files.create(file=f, purpose=\"fine-tune\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1abb8f3-9e52-4061-970c-fcf399d8ffa3",
   "metadata": {},
   "outputs": [],
   "source": [
    "validation_file"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cace6000",
   "metadata": {},
   "source": [
    "https://platform.openai.com/storage/files/"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "466052b9-9fb9-48f6-8cf9-c74e6ddc1394",
   "metadata": {},
   "source": [
    "# Step 2\n",
    "\n",
    "## And now time to Fine-tune!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45421b86-5531-4e42-ab19-d6abbb8f4c13",
   "metadata": {},
   "outputs": [],
   "source": [
    "openai.fine_tuning.jobs.create(\n",
    "    training_file=train_file.id,\n",
    "    validation_file=validation_file.id,\n",
    "    model=\"gpt-4.1-nano-2025-04-14\",\n",
    "    seed=42,\n",
    "    hyperparameters={\"n_epochs\": 1, \"batch_size\": 1},\n",
    "    suffix=\"pricer\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aeb9de2e-542c-4e83-81c7-b6745133e48b",
   "metadata": {},
   "outputs": [],
   "source": [
    "openai.fine_tuning.jobs.list(limit=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40d24873-8ff5-413f-b0d4-8f77c28f18e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1038399b",
   "metadata": {},
   "outputs": [],
   "source": [
    "job_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7e01247-c133-48e1-93d3-c79c399e6178",
   "metadata": {},
   "outputs": [],
   "source": [
    "openai.fine_tuning.jobs.retrieve(job_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f5150e1-b8de-485f-8eba-cf1e5b00c117",
   "metadata": {},
   "outputs": [],
   "source": [
    "openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b889e89",
   "metadata": {},
   "source": [
    "https://platform.openai.com/finetune\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "066fef03-8338-4526-9df3-89b649ad4f0a",
   "metadata": {},
   "source": [
    "# Step 3\n",
    "\n",
    "Test our fine tuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa4488cb-3c17-4eda-abd1-53c1c68a491b",
   "metadata": {},
   "outputs": [],
   "source": [
    "fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9370937-5a6f-4724-8265-b208663b4450",
   "metadata": {},
   "outputs": [],
   "source": [
    "fine_tuned_model_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66ea68e8-ab1b-4f0d-aba4-a59574d8f85e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The prompt\n",
    "\n",
    "def test_messages_for(item):\n",
    "    message = f\"Estimate the price of this product. Respond with the price, no explanation\\n\\n{item.summary}\"\n",
    "    return [\n",
    "        {\"role\": \"user\", \"content\": message},\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ff92d61-0d27-4b0d-8b32-c9891016509b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try this out\n",
    "\n",
    "test_messages_for(test[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "501a2a7a-69c8-451b-bbc0-398bcb9e1612",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The inference function\n",
    "\n",
    "\n",
    "def gpt_4__1_nano_fine_tuned(item):\n",
    "    response = openai.chat.completions.create(\n",
    "        model=fine_tuned_model_name,\n",
    "        messages=test_messages_for(item),\n",
    "        max_tokens=7\n",
    "    )\n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "843d88b4-364a-431b-b48b-8a7c1f68b786",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(test[0].price)\n",
    "print(gpt_4__1_nano_fine_tuned(test[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36bdd2c9-1859-4f99-a09f-3ec83b845b30",
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate(gpt_4__1_nano_fine_tuned, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e64d2b20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 96.58 - mini 200\n",
    "# 79.29 - mini 2000\n",
    "# 82.26 - nano 2000\n",
    "# 67.75 - nano 20,000"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
